setwd("~/Desktop/working-with-lyle/formality_project")
if (!require("pacman")) install.packages("pacman") #run this if you don't have pacman
library(pacman)
pacman::p_load(tidyverse,rlang, zoo, lubridate, plotrix, ggpubr, caret, broom, kableExtra, reactable, install = T)
# use pacman to load packages quickly

# Colour palettes.
# NOTE(review): `palette_map` was fused into the comment above in the
# original line and therefore never defined; it is restored as its own
# statement here.
palette_map <- c("#3B9AB2", "#EBCC2A", "#F21A00")
palette_condition <- c("#ee9b00", "#bb3e03", "#005f73")

# Shared ggplot2 theme that the trend plots add via `+ plot_aes`.
plot_aes <- theme_classic() +
  theme(
    legend.position = "top",
    legend.text = element_text(size = 12),
    text = element_text(size = 16, family = "Futura Medium"),
    axis.text = element_text(color = "black"),
    axis.line = element_line(colour = "black"),
    axis.ticks.y = element_blank()
  )

# Render a model's coefficient table as a styled kable.
#
# model_data: object passed to tidy() (the callers in this file pass lm fits).
# reference:  label substituted for "(Intercept)" in the `term` column.
# Returns the result of kableExtra::kable_styling(); columns std.error,
# statistic and p.value are renamed to SE, t and p, and "Date" inside a term
# is replaced with "Original Publication Date".
table_model <- function(model_data, reference = "Intercept") {
  model_data %>%
    tidy() %>%
    rename(
      "SE" = std.error,
      "t" = statistic,
      "p" = p.value
    ) %>%
    mutate(
      term = gsub("\\(Intercept\\)", !!reference, term),
      term = gsub("Date", "Original Publication Date", term)
    ) %>%
    kable() %>%
    kableExtra::kable_styling()
}

# read in the data
df <- read_csv("https://raw.githubusercontent.com/scm1210/Language_Lab_Repro/main/Atlantic_Cleaned_all_vars.csv")
# screen outliers
# Step 1: add a z-scored copy (`<name>_scaled`) of every measure.
screen_vars <- c(
  "Analytic", "WPS", "BigWords", "Period", "readability", "grade_level",
  "i", "we", "pronoun", "article", "cogproc", "Apostro", "Conversation",
  "det", "syllables_per_word", "syllables_per_sentence"
)
df[, paste0(screen_vars, "_scaled")] <- lapply(df[, screen_vars], scale)

# Step 2: keep only rows within 3 SD of the mean on each screened measure.
# NOTE(review): the original omitted `<= 3` on the Apostro, Conversation and
# det terms, so they were coerced to logical (any non-zero z-score passed) and
# screened nothing. With the comparison restored, more rows may be dropped and
# the reported article count can change.
# The two syllables_* z-scores are computed above but, as in the original,
# are not part of the screen.
df <- subset(
  df,
  abs(Analytic_scaled) <= 3 &
    abs(WPS_scaled) <= 3 &
    abs(BigWords_scaled) <= 3 &
    abs(Period_scaled) <= 3 &
    abs(readability_scaled) <= 3 &
    abs(grade_level_scaled) <= 3 &
    abs(i_scaled) <= 3 &
    abs(we_scaled) <= 3 &
    abs(pronoun_scaled) <= 3 &
    abs(article_scaled) <= 3 &
    abs(cogproc_scaled) <= 3 &
    abs(Apostro_scaled) <= 3 &
    abs(Conversation_scaled) <= 3 &
    abs(det_scaled) <= 3
)
# filter out impossible values
# All conditions must hold for a row to be kept; rows where a condition
# evaluates to NA are dropped, exactly as with the chained filter() calls.
df <- df %>%
  filter(
    readability >= 0, readability <= 120,
    grade_level >= 0, grade_level <= 18,
    Period > 0, Period <= 20,
    WPS < 145
  )

Flesch-Kincaid Ease of Readability: higher scores indicate material that is easier to read; lower numbers mark passages that are more difficult to read.
Flesch–Kincaid Grade Level: expresses readability as a U.S. grade level, making it easier for teachers, parents, librarians, and others to judge the readability level of various books and texts.
The following corpus consists of 42,528 articles ranging from 1857 to 2022.
# Smallest and largest value of the Date column in the filtered data.
range(select(df, Date))
## [1] 1857 2022
The article count is the number remaining after filtering out outliers and duplicates.
# Number of rows (articles) left in the filtered data, shown as a table.
df %>%
  select(Filename) %>%
  dplyr::summarize(n = n()) %>%
  reactable::reactable(striped = TRUE)

# Number of distinct Filename/Date pairs per Date, shown as a table.
# distinct() + count() is equivalent to the original
# select() + unique() + group_by() + summarize(n = n()).
articles_year <- df %>%
  dplyr::distinct(Filename, Date) %>%
  dplyr::count(Date) %>%
  reactable::reactable(striped = TRUE)
articles_year

Please see attached files for the graphs if needed.
# Plot readability_mean against Date: points plus a LOESS smoother (span 0.60).
# The stacked theme() calls are collapsed to their effective final values
# (later calls overrode the title and axis-text sizes set by earlier ones).
readability_smooth_tidy <-
  ggplot(tidy_df, aes(x = Date, y = readability_mean, group = 1)) +
  geom_point(color = "dodgerblue3", alpha = 0.7) +
  geom_smooth(method = "loess", span = 0.60) +
  labs(title = "Readability", x = "Year", y = "Ease of Readability") +
  plot_aes +
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )
# Plot grade_level_mean against Date: points plus a LOESS smoother (span 0.80).
# The stacked theme() calls are collapsed to their effective final values
# (later calls overrode the title and axis-text sizes set by earlier ones).
grade_smooth_tidy <-
  ggplot(tidy_df, aes(x = Date, y = grade_level_mean, group = 1)) +
  geom_point(color = "dodgerblue3", alpha = 0.7) +
  geom_smooth(method = "loess", span = 0.80) +
  labs(title = "Grade Level", x = "Year", y = "Grade Level Score") +
  plot_aes +
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )
# Plot syllables_per_word_mean against Date: points plus a LOESS smoother
# (span 0.80).
# The stacked theme() calls are collapsed to their effective final values
# (later calls overrode the title and axis-text sizes set by earlier ones).
syllables_per_word_smooth_tidy <-
  ggplot(tidy_df, aes(x = Date, y = syllables_per_word_mean, group = 1)) +
  geom_point(color = "dodgerblue3", alpha = 0.7) +
  geom_smooth(method = "loess", span = 0.80) +
  labs(title = "Syllables per word", x = "Year", y = "Syllables per word") +
  plot_aes +
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )
# Plot syllables_per_sentence_mean against Date: points plus a LOESS smoother
# (span 0.80).
# The stacked theme() calls are collapsed to their effective final values
# (later calls overrode the title and axis-text sizes set by earlier ones).
syllables_per_sentence_smooth_tidy <-
  ggplot(tidy_df, aes(x = Date, y = syllables_per_sentence_mean, group = 1)) +
  geom_point(color = "dodgerblue3", alpha = 0.7) +
  geom_smooth(method = "loess", span = 0.80) +
  labs(
    title = "Syllables per sentence",
    x = "Year",
    y = "Syllables per sentence"
  ) +
  plot_aes +
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )
# Plot syllables_per_word_mean against Date: points plus a LOESS smoother
# (span 0.80).
# NOTE(review): this re-assigns syllables_per_word_smooth_tidy with the same
# specification used when the object was first defined earlier in the file;
# one of the two definitions can likely be dropped.
# The stacked theme() calls are collapsed to their effective final values.
syllables_per_word_smooth_tidy <-
  ggplot(tidy_df, aes(x = Date, y = syllables_per_word_mean, group = 1)) +
  geom_point(color = "dodgerblue3", alpha = 0.7) +
  geom_smooth(method = "loess", span = 0.80) +
  labs(title = "Syllables per word", x = "Year", y = "Syllables per word") +
  plot_aes +
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )
# Stack three of the trend plots in a single column with a shared legend at
# the bottom, then add an overall title.
# NOTE(review): syllables_per_sentence_smooth_tidy is not part of the combined
# figure (nrow = 3); confirm that is intended.
tidy_smooth_graphs <- ggpubr::ggarrange(
  readability_smooth_tidy,
  grade_smooth_tidy,
  syllables_per_word_smooth_tidy,
  ncol = 1, nrow = 3, common.legend = TRUE, legend = "bottom"
)
annotate_figure(
  tidy_smooth_graphs,
  top = text_grob(
    "Atlantic Flesch-Kincaid and Syllables",
    color = "black", face = "bold", size = 20
  )
)

# Print each plot individually (these statements were fused onto one line).
readability_smooth_tidy
grade_smooth_tidy
syllables_per_sentence_smooth_tidy
syllables_per_word_smooth_tidy

The model presented is centered on the means for the first year in the dataset.
# Centered
# Linear model of readability_centered on Date, then its coefficient table.
# The rendered table below was fused onto the table_model() call line.
Readability_centered <- lm(readability_centered ~ Date, data = tidy_df)
table_model(Readability_centered)

| term | estimate | SE | t | p |
|---|---|---|---|---|
| Intercept | 46.4160 | 4.9999 | 9.283 | 0 |
| Original Publication Date | -0.0261 | 0.0026 | -10.144 | 0 |
The model presented is centered on the means for the first year in the dataset.
# Centered
# Linear model of grade_level_centered on Date, then its coefficient table.
# The rendered table below was fused onto the table_model() call line.
Grade_centered <- lm(grade_level_centered ~ Date, data = tidy_df)
table_model(Grade_centered)

| term | estimate | SE | t | p |
|---|---|---|---|---|
| Intercept | 6.7569 | 1.1769 | 5.741 | 0 |
| Original Publication Date | -0.0033 | 0.0006 | -5.383 | 0 |
# Linear model of syllables_per_word_centered on Date, then its coefficient
# table. The model object shares its name with the response column; inside
# lm() the formula is resolved against `data = tidy_df`.
# The rendered table below was fused onto the table_model() call line.
syllables_per_word_centered <- lm(syllables_per_word_centered ~ Date, data = tidy_df)
table_model(syllables_per_word_centered)

| term | estimate | SE | t | p |
|---|---|---|---|---|
| Intercept | -1.1875 | 0.0594 | -19.99 | 0 |
| Original Publication Date | 0.0006 | 0.0000 | 21.00 | 0 |